In [ ]:
import logging
import requests
from pprint import pprint
from requests import RequestException
from os import path
from bs4 import BeautifulSoup

from html2corpus import HTML2Corpus
from html2corpus.extractors import ReadabilityExtractor, ParagraphExtractor

def check(link, blackwords):
    # True only if none of the blacklisted substrings occur in the link.
    return all(blackword not in link for blackword in blackwords)
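
For instance, a quick hypothetical sanity check of the helper (illustrative URLs only):


In [ ]:
print(check('http://example.org/artikel/1', ['impressum', 'datenschutz']))  # True
print(check('http://example.org/impressum', ['impressum', 'datenschutz']))  # False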

Crawling German political party sites

Die Linke

News from the website of the party: http://www.die-linke.de/nc/die-linke/nachrichten


In [ ]:
domain = 'http://www.die-linke.de'
keyword = 'artikel'

site = 'http://www.die-linke.de/nc/die-linke/nachrichten'
pages = ['{}/browse/{}'.format(site, i) for i in range(1, 99)]
pages.append(site)

def get_data():
    for page in pages:
        try:
            req = requests.get(page, timeout=10)
            soup = BeautifulSoup(req.content)
            links = set([link['href'] for link in soup.findAll('a')
                     if link.get('href') and keyword in link['href']])
            links = map(lambda x: '{}/{}'.format(domain, x), links)
            for article in list(links):
                article_req = requests.get(article)
                yield article_req.content
        except RequestException as error:
            logging.error('Error: %s', error)
            
HTML2Corpus(get_data(), extractor=ReadabilityExtractor(min_len=100)).save(path.join('data', 'Corpus_DieLinke.txt'))
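
The cells below repeat this fetch-parse-yield pattern, with only the domain, link keyword, blackwords, and pagination scheme changing. As a sketch (not used by the cells that follow), the pattern could be factored into one reusable generator:


In [ ]:
def crawl(pages, domain, keyword, blackwords=()):
    # Generic form of get_data() above: fetch each listing page, keep the
    # article links containing `keyword` (and none of the `blackwords`),
    # and yield the raw HTML of every article.
    for page in pages:
        try:
            req = requests.get(page, timeout=10)
            soup = BeautifulSoup(req.content)
            links = set(link['href'] for link in soup.findAll('a')
                        if link.get('href') and keyword in link['href']
                        and check(link['href'], blackwords))
            for href in links:
                yield requests.get('{}/{}'.format(domain, href), timeout=10).content
        except RequestException as error:
            logging.error('Error: %s', error)

# Usage would then be, e.g.:
# HTML2Corpus(crawl(pages, domain, 'artikel'),
#             extractor=ReadabilityExtractor(min_len=100)).save(path.join('data', 'Corpus_DieLinke.txt'))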

Press releases from the website of the party: http://www.die-linke.de/nc/presse/presseerklaerungen/presseerklaerungen


In [ ]:
domain = 'http://www.die-linke.de'
keyword = 'artikel'
site = 'http://www.die-linke.de/nc/presse/presseerklaerungen/presseerklaerungen'
pages = ['{}/browse/{}'.format(site, i) for i in range(1, 272)]
pages.append(site)

def get_data():
    for page in pages:
        try:
            req = requests.get(page, timeout=10)
            soup = BeautifulSoup(req.content)
            links = set([link['href'] for link in soup.findAll('a')
                     if link.get('href') and keyword in link['href']])
            links = map(lambda x: '{}/{}'.format(domain, x), links)
            for article in list(links):
                article_req = requests.get(article)
                yield article_req.content
        except RequestException as error:
            logging.error('Error: %s', error)
            
HTML2Corpus(get_data(), extractor=ReadabilityExtractor(min_len=100)).save(path.join('data', 'Corpus_DieLinke_PR.txt'))

Press releases from the faction: http://www.linksfraktion.de/pressemitteilungen


In [ ]:
domain = 'http://www.linksfraktion.de'
keyword = 'pressemitteilungen'
site = 'http://www.linksfraktion.de/pressemitteilungen'
pages = ['{}/?s={}'.format(site, i) for i in range(1, 1384)]
pages.append(site)

def get_data():
    for page in pages:
        try:
            req = requests.get(page, timeout=10)
            soup = BeautifulSoup(req.content)
            links = set([link['href'] for link in soup.findAll('a')
                     if link.get('href') and keyword in link['href']])
            links = map(lambda x: '{}/{}'.format(domain, x), links)
            for article in list(links):
                article_req = requests.get(article)
                yield article_req.content
        except RequestException as error:
            logging.error('Error: %s', error)
            
HTML2Corpus(get_data(), extractor=ReadabilityExtractor(min_len=100)).save(path.join('data', 'Corpus_DieLinke_Fraktion.txt'))

SPD

Press releases from the website of the faction: http://www.spdfraktion.de/presse/pressemitteilungen


In [ ]:
domain = 'http://www.spdfraktion.de'
keyword = 'presse/pressemitteilungen/'
blackword = 'feed'
site = 'http://www.spdfraktion.de/presse/pressemitteilungen'
pages = ['{}?page={}'.format(site, i) for i in range(1, 733)]
pages.append(site)

def get_data():
    for page in pages:
        try:
            req = requests.get(page, timeout=10)
            soup = BeautifulSoup(req.content)
            links = set([link['href'] for link in soup.findAll('a')
                     if link.get('href') and keyword in link['href'] and blackword not in link['href']])
            links = map(lambda x: '{}/{}'.format(domain, x), links)
            for article in list(links):
                article_req = requests.get(article)
                yield article_req.content
        except RequestException as error:
            logging.error('Error: %s', error)
            
HTML2Corpus(get_data(), extractor=ReadabilityExtractor(min_len=100)).save(path.join('data', 'Corpus_SPD_Fraktion.txt'))

Press releases from the website of the EU faction: https://www.spd-europa.de/pressemitteilung


In [ ]:
domain = 'https://www.spd-europa.de'
keyword = '/pressemitteilungen/'
site = 'https://www.spd-europa.de/pressemitteilung'
pages = ['{}?page={}'.format(site, i) for i in range(1, 165)]
pages.append(site)

def get_data():
    for page in pages:
        try:
            req = requests.get(page, timeout=10)
            soup = BeautifulSoup(req.content, 'lxml')
            links = set([link['href'] for link in soup.findAll('a')
                     if link.get('href') and keyword in link['href']])
            links = map(lambda x: '{}{}'.format(domain, x), links)
            for article in list(links):
                article_req = requests.get(article)
                yield article_req.content
        except RequestException as error:
            logging.error('Error: %s', error)
            
HTML2Corpus(get_data(), extractor=ParagraphExtractor(min_len=150)).save(path.join('data', 'SPD_EU.txt'))

Vorwärts, an SPD newspaper.


In [ ]:
domain = 'http://www.vorwaerts.de'
keyword = '/artikel/'
blackwords = ['#comment-form']
site = 'http://www.vorwaerts.de/international'
pages = ['{}?page={}'.format(site, i) for i in range(1, 124)]
pages.append(site)

def get_data():
    for page in pages:
        try:
            req = requests.get(page, timeout=10)
            soup = BeautifulSoup(req.content, 'lxml')
            links = [link['href'] for link in soup.findAll('a')
                        if link.get('href', None) and keyword in link['href'] and check(link['href'], blackwords)]
            links = map(lambda x: '{}{}'.format(domain, x), links)
            for article in list(links):
                article_req = requests.get(article)
                yield article_req.content
        except RequestException as error:
            logging.error('Error: %s', error)
            
HTML2Corpus(get_data(), extractor=ParagraphExtractor(min_len=100)).save(path.join('data', 'SPD_Vorwärts.txt'))

Grüne

Press releases from the website of the faction: http://www.gruene-bundestag.de/presse_ID_2000127


In [ ]:
domain = 'http://www.gruene-bundestag.de'
keyword = 'presse/pressemitteilungen/'
blackword = 'feed'
site = 'http://www.gruene-bundestag.de/presse_ID_2000127'
pages = ['{}/pb_id/100/seite/{}'.format(site, i) for i in range(2, 1322)]
pages.append(site)

def get_data():
    for page in pages:
        try:
            req = requests.get(page, timeout=10)
            soup = BeautifulSoup(req.content)
            links = set([link['href'] for link in soup.findAll('a')
                     if link.get('href') and keyword in link['href'] and blackword not in link['href']])
            links = map(lambda x: '{}/{}'.format(domain, x), links)
            for article in list(links):
                article_req = requests.get(article)
                yield article_req.content
        except RequestException as error:
            logging.error('Error: %s', error)
            
HTML2Corpus(get_data(), extractor=ReadabilityExtractor(min_len=100)).save(path.join('data', 'Corpus_Grüne_Fraktion.txt'))

FDP

Press releases from the website of the party: http://www.fdp.de/pressemitteilungen


In [ ]:
domain = 'http://www.fdp.de'
keyword = '/content/'
blackwords = set(['datenschutz', 'impressum'])
site = 'http://www.fdp.de/pressemitteilungen'
pages = ['{}?page={}'.format(site, i) for i in range(1, 97)]
pages.append(site)

def get_data():
    for page in pages:
        try:
            req = requests.get(page, timeout=10)
            soup = BeautifulSoup(req.content)
            links = set([link['href'] for link in soup.findAll('a')
                     if link.get('href') and keyword in link['href'] and check(link['href'], blackwords)])
            links = map(lambda x: '{}/{}'.format(domain, x), links)
            for article in list(links):
                article_req = requests.get(article)
                yield article_req.content
        except RequestException as error:
            logging.error('Error: %s', error)
            
HTML2Corpus(get_data(), extractor=ParagraphExtractor(min_len=100)).save(path.join('data', 'Corpus_FDP.txt'))

In [ ]:
domain = 'http://www.liberale.de'
keyword = '/content/'
blackwords = set(['datenschutz', 'impressum'])
site = 'http://www.liberale.de/page/pressemitteilungen'
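# %2C is a URL-encoded comma, so these listing URLs read as ?page=0,1 up to ?page=0,1062.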
pages = ['{}?page=0%2C{}'.format(site, i) for i in range(1, 1063)]
pages.append(site)

def get_data():
    for page in pages:
        try:
            req = requests.get(page, timeout=10)
            soup = BeautifulSoup(req.content)
            links = set([link['href'] for link in soup.findAll('a')
                     if link.get('href') and keyword in link['href'] and check(link['href'], blackwords)])
            links = map(lambda x: '{}/{}'.format(domain, x), links)
            for article in list(links):
                article_req = requests.get(article)
                yield article_req.content
        except RequestException as error:
            logging.error('Error: %s', error)
            
HTML2Corpus(get_data(), extractor=ParagraphExtractor(min_len=100)).save(path.join('data', 'Corpus_FDP_Fraktion.txt'))

CDU

CDU/CSU faction


In [ ]:
domain = 'http://www.presseportal.de'
keyword = '/pm/7846/'
site = 'http://www.presseportal.de/nr/7846'
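# Note: presseportal paginates by result offset in the URL path rather than a
# ?page= parameter; with the step of 27 used here the listing pages are
# .../nr/7846/27, .../nr/7846/54, and so on.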
pages = ['{}/{}'.format(site, i * 27) for i in range(1, 621)]
pages.append(site)

def get_data():
    for page in pages:
        try:
            req = requests.get(page, timeout=10)
            soup = BeautifulSoup(req.content)
            links = set([link['href'] for link in soup.findAll('a')
                     if link.get('href') and keyword in link['href']])
            links = map(lambda x: '{}/{}'.format(domain, x), links)
            for article in list(links):
                article_req = requests.get(article)
                yield article_req.content
        except RequestException as error:
            logging.error('Error: %s', error)
            
HTML2Corpus(get_data(), extractor=ParagraphExtractor(min_len=150)).save(path.join('data', 'CDU_Fraktion.txt'))

In [ ]:
domain = 'http://www.presseportal.de'
keyword = '/pm/6518/'
site = 'http://www.presseportal.de/nr/6518'
pages = ['{}/{}'.format(site, i * 27) for i in range(1, 38)]
pages.append(site)

def get_data():
    for page in pages:
        try:
            req = requests.get(page, timeout=10)
            soup = BeautifulSoup(req.content)
            links = set([link['href'] for link in soup.findAll('a')
                     if link.get('href') and keyword in link['href']])
            links = map(lambda x: '{}/{}'.format(domain, x), links)
            for article in list(links):
                article_req = requests.get(article)
                yield article_req.content
        except RequestException as error:
            logging.error('Error: %s', error)
            
HTML2Corpus(get_data(), extractor=ParagraphExtractor(min_len=150)).save(path.join('data', 'CDU.txt'))

In [ ]:
domain = 'http://www.cdu-csu-ep.de'
keyword = '/presse/pressemitteilungen/'
blackwords = set(['content'])
site = 'http://www.cdu-csu-ep.de/pressearchiv.html'
pages = ['{}?start={}'.format(site, i * 5) for i in range(0, 643)]

def get_data():
    for page in pages:
        try:
            req = requests.get(page, timeout=10)
            soup = BeautifulSoup(req.content)
            links = set([link['href'] for link in soup.findAll('a')
                     if link.get('href', None) and keyword in link['href'] and check(link['href'], blackwords)])
            links = map(lambda x: '{}/{}'.format(domain, x), links)
            for article in list(links):
                article_req = requests.get(article)
                yield article_req.content
        except RequestException as error:
            logging.error('Error: %s', error)
            
HTML2Corpus(get_data(), extractor=ParagraphExtractor(min_len=150)).save(path.join('data', 'CDU_EU.txt'))

NPD


In [ ]:
keyword = '/?p='
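# '/?p=' matches WordPress-style post permalinks; the matched hrefs appear to be
# absolute URLs here, so no domain prefix is prepended below.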
site = 'http://aktion-widerstand.de/?page_id=11042'
pages = ['{}&paged={}'.format(site, i) for i in range(2, 335)]
pages.append(site)

def get_data():
    for page in pages:
        try:
            req = requests.get(page, timeout=10)
            soup = BeautifulSoup(req.content)
            links = set([link['href'] for link in soup.findAll('a')
                     if link.get('href') and keyword in link['href']])
            for article in list(links):
                article_req = requests.get(article)
                yield article_req.content
        except RequestException as error:
            logging.error('Error: %s', error)
            
HTML2Corpus(get_data(), extractor=ParagraphExtractor(min_len=100)).save(path.join('data', 'Corpus_NPD_Jung.txt'))

In [ ]:
domain = 'http://www.npd-fraktion-mv.de'
keyword = '&view=article&'
blackwords = set(['content'])
site = 'http://www.npd-fraktion-mv.de/index.php?com=news&view=archive'
pages = ['{}&b={}&mid=8'.format(site, i * 50) for i in range(0, 38)]

def get_data():
    for page in pages:
        try:
            req = requests.get(page, timeout=10)
            soup = BeautifulSoup(req.content)
            links = set([link['href'] for link in soup.findAll('a')
                     if link.get('href') and keyword in link['href'] and check(link['href'], blackwords)])
            links = map(lambda x: '{}/{}'.format(domain, x), links)
            for article in list(links):
                article_req = requests.get(article)
                yield article_req.content
        except RequestException as error:
            logging.error('Error: %s', error)
            
HTML2Corpus(get_data(), extractor=ReadabilityExtractor(min_len=100)).save(path.join('data', 'Corpus_NPD_MV.txt'))

In [ ]:
domain = 'http://www.npd-fraktion-sachsen.de'
blackwords = set(['meldungen', 'category', 'author'])
site = 'http://www.npd-fraktion-sachsen.de/category/meldungen'
pages = ['{}/page/{}'.format(site, i) for i in range(2, 194)]

def get_data():
    for page in pages:
        try:
            req = requests.get(page, timeout=10)
            soup = BeautifulSoup(req.content)
            blog = soup.find('div', id='blog-left')
            if blog is None:
                continue
            links = set([link['href'] for link in blog.findAll('a')
                        if link.get('href') and check(link['href'], blackwords)])
            for article in list(links):
                article_req = requests.get(article)
                yield article_req.content
        except RequestException as error:
            logging.error('Error: %s', error)
            
HTML2Corpus(get_data(), extractor=ReadabilityExtractor(min_len=100)).save(path.join('data', 'Corpus_NPD_Sachsen.txt'))